Upload the dataframes and alignment (in FASTA) to calculate the pairwise genetic distance matrix:

library("ape")
library("seqinr") 
## 
## Attaching package: 'seqinr'
## The following objects are masked from 'package:ape':
## 
##     as.alignment, consensus
library("FactoMineR") 
library("factoextra") 
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library("corrplot") 
## corrplot 0.92 loaded
RVAdistdf <- read.csv("RVAdistdf.csv", row.names=1) #the pairwise genetic distance dataframe
RVAmetadf <-read.csv("RVA-PCA_metadata.csv", row.names=1) #the metadata associated to sequences/individuals
RVAmetadf[RVAmetadf ==''] <- NA
RVAmetadf$AgeGroup <- as.factor(RVAmetadf$AgeGroup)
RVAmetadf$Genotype <- as.factor(RVAmetadf$Genotype)
RVAmetadf$ZIPCODE <- as.factor(RVAmetadf$ZIPCODE)
RVAmetadf$sex <- as.factor(RVAmetadf$sex)
RVAmetadf$COLLECTION <- as.factor(RVAmetadf$COLL_MONTH)
RVAmetadf$Conclusion <- as.factor(RVAmetadf$Conclusion)
RVAmetadf$age <- as.numeric(RVAmetadf$age)

Calculate the eigenvalues and principal component analysis:

#calculating the PCA
res.RVA <- PCA(RVAdistdf, scale.unit = TRUE, ncp = 4)

eig.val <- get_eigenvalue(res.RVA)

#To help determining the number of Principal Components we can see the Scree Plot, which is the plot of the eigenvalues ordered from largest to the smallest.
fviz_eig(res.RVA, addlabels = TRUE, ylim = c(0, 50))

Analysis of quality and contribution of individuals:

fviz_pca_ind(res.RVA, col.ind = "cos2", pointsize = "cos2", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), labelsize=2)

#To visualize the contribution of individuals to the first two principal components:
fviz_contrib(res.RVA, choice = "ind", axes = 1:2, labelsize=2) + theme(text = element_text(size=7))

Evaluating the PCA by clustering based on metadata information:

#Individuals by Age Group
fviz_pca_ind(res.RVA, geom.ind = "point", col.ind = RVAmetadf$AgeGroup, palette = "lancet", addEllipses = TRUE, legend.title = "Age Group")

#Individuals by Symptoms
fviz_pca_ind(res.RVA, geom.ind = "point", col.ind = RVAmetadf$Conclusion, palette = "lancet", addEllipses = TRUE, legend.title = "Symptoms")
## Warning: Removed 19 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).

#Individuals by month and year of collection
fviz_pca_ind(res.RVA, geom.ind = "point", col.ind = RVAmetadf$COLLECTION, palette = "lancet", addEllipses = TRUE, legend.title = "Collection Month")
## Too few points to calculate an ellipse

#Individuals by Zip Code
fviz_pca_ind(res.RVA, geom.ind = "point", col.ind = RVAmetadf$ZIPCODE, addEllipses = TRUE, legend.title = "Zip Code")
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse

#Individuals by Gender
fviz_pca_ind(res.RVA, geom.ind = "point", col.ind = RVAmetadf$sex, palette = "lancet", addEllipses = TRUE, legend.title = "Sex")

Constructing a dendrogram and making the hierarchical clustering:

library("dendextend")
## 
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following objects are masked from 'package:ape':
## 
##     ladderize, rotate
## The following object is masked from 'package:stats':
## 
##     cutree
library("svglite")
RVAdm<-dist(RVAdistdf, method = 'euclidean') 
RVAhc<-hclust(RVAdm, method="complete") # simple dendrogram

#evaluating the number of clusters con average silhouette:
fviz_nbclust(RVAdistdf, FUNcluster=hcut, method="silhouette", k.max = 10)

fviz_nbclust(RVAdistdf, FUNcluster=hcut, method="wss")

cut_cmp <- cutree(RVAhc, k = 4) #applying the clustering by wss
plot(RVAhc, hang=-1, cex=0.2) 
#plot with cluster rectangles within the clusters:
rect.hclust(RVAhc, k=4, border=2:4)

#save plot of dendrogram:
#svg(filename = "RVA_HC_PCA_euclidean.svg")
#plot(RVAhc, hang=-1, cex=0.2) 
#rect.hclust(RVAhc, k=4, border=2:4) #if I want to see the clusters identified by a rectangle
#dev.off()

Evaluate the association of metadata with the clusters by hierarchical clustering:

#make a dataframe with the clusters and evaluating association with metadata using G test of independence between groups:
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:seqinr':
## 
##     count
## The following object is masked from 'package:ape':
## 
##     where
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("rcompanion")
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ✔ readr     2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::count()  masks seqinr::count()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::where()  masks ape::where()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("ggpubr")
## 
## Attaching package: 'ggpubr'
## 
## The following object is masked from 'package:dendextend':
## 
##     rotate
## 
## The following object is masked from 'package:ape':
## 
##     rotate
library("rstatix")
## 
## Attaching package: 'rstatix'
## 
## The following object is masked from 'package:stats':
## 
##     filter
library("AMR")
RVA_cluster <- merge(RVAmetadf, cut_cmp, by = 0)
RVA_cluster$y<-as.factor(RVA_cluster$y)

g.test(RVA_cluster$Conclusion, RVA_cluster$y)
## 
##  G-test of independence
## 
## data:  RVA_cluster$Conclusion and RVA_cluster$y
## X-squared = 1.1482, p-value = 0.7654
X1 = pairwiseNominalIndependence(as.matrix(table(RVA_cluster$Conclusion, RVA_cluster$y)), compare = "column", fisher = FALSE, chisq = FALSE, method = "bonferroni", digits = 3)
cldList(p.adj.Gtest ~ Comparison, data= X1, threshold  = 0.05)
##   Group Letter MonoLetter
## 1     1      a          a
## 2     2      a          a
## 3     3      a          a
## 4     4      a          a
ggplot(RVA_cluster, aes(x=Conclusion, fill=y)) + geom_bar(position = "fill")

g.test(RVA_cluster$ZIPCODE, RVA_cluster$y)
## Warning in g.test(RVA_cluster$ZIPCODE, RVA_cluster$y): G-statistic
## approximation may be incorrect due to E < 5
## 
##  G-test of independence
## 
## data:  RVA_cluster$ZIPCODE and RVA_cluster$y
## X-squared = 61.369, p-value = 0.05259
X2 = pairwiseNominalIndependence(as.matrix(table(RVA_cluster$ZIPCODE, RVA_cluster$y)), compare = "column", fisher = FALSE, chisq = FALSE, method = "bonferroni", digits = 3)
cldList(p.adj.Gtest ~ Comparison, data= X2, threshold  = 0.05)
##   Group Letter MonoLetter
## 1     1      a          a
## 2     2      a          a
## 3     3      a          a
## 4     4      a          a
ggplot(RVA_cluster, aes(x=ZIPCODE, fill=y)) + geom_bar(position = "fill") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) 

X3 = pairwiseNominalIndependence(as.matrix(table(RVA_cluster$sex, RVA_cluster$y)), compare = "column", fisher = FALSE, chisq = FALSE, method = "bonferroni", digits = 3)
cldList(p.adj.Gtest ~ Comparison, data= X3, threshold  = 0.05)
##   Group Letter MonoLetter
## 1     1      a          a
## 2     2      a          a
## 3     3      a          a
## 4     4      a          a
ggplot(RVA_cluster, aes(x=sex, fill=y)) + geom_bar(position = "fill")

X4 = pairwiseNominalIndependence(as.matrix(table(RVA_cluster$AgeGroup, RVA_cluster$y)), compare = "column", fisher = FALSE, chisq = FALSE, method = "bonferroni", digits = 3) 
cldList(p.adj.Gtest ~ Comparison, data= X4, threshold  = 0.05)
##   Group Letter MonoLetter
## 1     1     ab         ab
## 2     2      a         a 
## 3     3      b          b
## 4     4      a         a
ggplot(RVA_cluster, aes(x=AgeGroup, fill=y)) + geom_bar(position = "fill") + scale_x_discrete(limits = c("less5", "5-17", "18-64", "more65"))

#Further analysis with age as continuous variable:
res.kruskal <- RVA_cluster %>% kruskal_test(age ~ y) #Kruskal-Wallis Test
res.kruskal
## # A tibble: 1 × 6
##   .y.       n statistic    df     p method        
## * <chr> <int>     <dbl> <int> <dbl> <chr>         
## 1 age     258      3.35     3 0.341 Kruskal-Wallis
#Wilcoxon’s test to calculate pairwise comparisons between group:
pwc <- RVA_cluster %>% 
  wilcox_test(age ~ y, p.adjust.method = "bonferroni") 
pwc
## # A tibble: 6 × 9
##   .y.   group1 group2    n1    n2 statistic     p p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>       
## 1 age   1      2         35   106     1560. 0.159 0.954 ns          
## 2 age   1      3         35    38      666  0.996 1     ns          
## 3 age   1      4         35    79     1209  0.288 1     ns          
## 4 age   2      3        106    38     2312  0.177 1     ns          
## 5 age   2      4        106    79     4372  0.608 1     ns          
## 6 age   3      4         38    79     1312  0.272 1     ns
#visualization: box plots with p-values:
pwc <- pwc %>% add_xy_position(x = "y")
ggboxplot(RVA_cluster, x = "y", y = "age") +
  stat_pvalue_manual(pwc, hide.ns = TRUE) +
  labs(
    subtitle = get_test_label(res.kruskal, detailed = TRUE),
    caption = get_pwc_label(pwc)
    )